Introduction

This script reproduces the differential expression analysis presented in the paper Roux et al. 

Due to patient privacy concerns patient metadata (used for Figure 1 and S1) and raw sequencing data cannot be shared publicly. The analysis code below starts from the read count matrix and associated sample and gene info files available from GEO (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE273902)

TO DO: add title and link to BioRXiv! TO DO: somehow the GO network plot is not reproduced perfectly, weird because I can reproduce it in my analysis script…

Load libraries

library(tidyverse)

## Gene annotation
library(ensembldb)
library(AnnotationHub)
library(org.Hs.eg.db)
library(RSQLite)

## PCA
library(SingleCellExperiment)
library(scater)

## Differential expression
library(limma)
library(edgeR)

## GSEA
library(GO.db)
library(igraph)
library(ggraph)

## TF activity 
library(decoupleR)

## Ligands/receptors
library(CellChat) 

## Differential abundance
library(xCell)

## Comparison to external datasets
library(GEOquery)
library(affy)
library(illuminaHumanv2.db)
library(illuminaHumanv3.db)
library(hgu133plus2.db)

## Plotting
library(ggplot2)
library(ggExtra)
library(ggrepel)
library(ggpubr)
myTheme <- theme_bw(base_size = 20) +
  theme(panel.grid.major = element_line(colour = "lightgrey", linewidth = 0.2), 
        panel.grid.minor = element_line(colour = "lightgrey", linewidth = 0.05),
        legend.position="right", legend.direction = "vertical",
        legend.text=element_text(size=10),
        axis.title=element_text(size=20), 
        axis.text=element_text(size=10), 
        panel.spacing.x = unit(1.2, "lines"))
myPalette <-  c('#e6194b', '#4363d8', '#3cb44b', '#984EA3', '#f58231', '#ffe119', '#F781BF', '#808080', '#98BFDB', '#bcf60c', '#008080', '#e6beff', '#E5C494', '#000075', '#CD00CD', '#aaffc3', '#808000', '#9a6324', '#fffac8', '#800000', '#000000', '#ffffff')

# Useful for downloads
options(timeout=300)

## Save session info to show which versions of packages were used
library(devtools)
capture.output(session_info(), file="session_info.txt") 

# rmarkdown::render("Roux_et_al_bariatric_surgery_paper.Rmd")

Read-in and subset data

## Matrix of read counts
tab <- read.table(file = "data/GSE273902_Counts_Table.tsv.gz", sep = "\t", h=T)

## Info on samples
metadata <- read.table(file = "data/GSE273902_Sample_Info.tsv.gz", sep = "\t", h=T, quote = "")
metadata$Sample <- row.names(metadata)

## Info on genes
genes <- read.table(file = "data/GSE273902_Gene_Info.tsv.gz", sep = "\t", h=T, quote = "")
row.names(genes) <- genes$GENEID

## Build DGEList object
dge <- DGEList(counts = tab, 
               samples = metadata, 
               genes = genes)

## If additional information is needed for the genes, it can be retrieved with EnsemblDB, for example
ah <- AnnotationHub()
## snapshotDate(): 2023-10-23
ahDb <- query(ah, paste("Ensembl 102 EnsDb for Homo sapiens"))
edb <- ahDb[[1]]
## loading from cache
# dge$genes$SYMBOL <- mapIds(edb, key=dge$genes$GENEID, keytype="GENEID", column="SYMBOL", multiVals = "first")

## Calculate size factors
dge <- calcNormFactors(dge) 

## Filter DGEList to keep paired samples (visits 1 and 4)
patient_to_keep <- names(which(rowSums(table(dge$samples$Patient, dge$samples$Visit)[, c("V1", "V4")] != 0) == 2))
sample_to_keep <- dge$samples$Visit %in% c("V1", "V4") & dge$samples$Patient %in% patient_to_keep
dge.subset <- dge[, sample_to_keep]

## Clean-up samples variables
dge.subset$samples$Patient <- factor(dge.subset$samples$Patient)
dge.subset$samples$Visit <- factor(dge.subset$samples$Visit)
table(dge.subset$samples$Visit)
## 
## V1 V4 
## 24 24
dge.subset$samples$Sex <- factor(dge.subset$samples$Sex)
table(dge.subset$samples$Sex)
## 
## female   male 
##     28     20
dge.subset$samples$Patient.subgroup <- factor(dge.subset$samples$Patient.subgroup)
table(dge.subset$samples$Patient.subgroup)
## 
## Insulin dependent T2D / No remission                          No diabetes                   T2D / No remission 
##                                    8                                   16                                    8 
##                      T2D / Remission                              Unknown 
##                                   14                                    2
dge.subset$samples$Surgery.type <- factor(dge.subset$samples$Surgery.type)
## Simplify variable by pooling "gastric band" and "revision"
levels(dge.subset$samples$Surgery.type) <- c("other", "gastric bypass", "gastric sleeve", "other")
## Reorder
dge.subset$samples$Surgery.type <- factor(dge.subset$samples$Surgery.type, levels=levels(dge.subset$samples$Surgery.type)[c(2,3,1)])
table(dge.subset$samples$Surgery.type)
## 
## gastric bypass gastric sleeve          other 
##             22             18              8
## Filter genes that are lowly expressed 
keep <- rowSums(edgeR::cpm(dge.subset) > 1) >= 12 ## 1/4 of the samples
## Filter some gene biotypes, mostly pseudogenes
keep <- keep & (!dge.subset$genes$GENEBIOTYPE %in% c("pseudogene", "processed_pseudogene", "IG_C_pseudogene", 
                                                     "IG_D_pseudogene", "IG_pseudogene", "IG_V_pseudogene", "IG_J_pseudogene",
                                                     "TR_J_pseudogene", "TR_V_pseudogene", "transcribed_processed_pseudogene", 
                                                     "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene", 
                                                     "translated_processed_pseudogene", "translated_unprocessed_pseudogene", 
                                                     "unitary_pseudogene", "rRNA_pseudogene", 
                                                     "unprocessed_pseudogene", "polymorphic_pseudogene", "TEC") &
                  !is.na(dge.subset$genes$GENEBIOTYPE))
sum(keep) 
## [1] 14358
## Subset, while updating total counts
dge.subset <- dge.subset[keep, , keep.lib.sizes=FALSE]
## Redo TMM normalization
dge.subset <- calcNormFactors(dge.subset)

## Calculate CPMs
log2cpm.loess <- normalizeBetweenArrays(edgeR::cpm(dge.subset, log=TRUE, prior.count=8, normalized.lib.sizes=TRUE), method = "cyclicloess")
plotDensities(log2cpm.loess, group=dge.subset$samples$groupSimple, col=myPalette, legend = FALSE)

Figure 2

Principal component analysis

## Perform PCA on the 500 genes with largest inter-quartile range
iqrs <- apply(log2cpm.loess, 1, IQR)
sel <- iqrs >= sort(iqrs, decreasing = T)[500]
pca1 <- prcomp(t(log2cpm.loess[sel,]), scale = T)
## Variance explained by top PCs
summary(pca1)$importance[, 1:10]
##                             PC1      PC2      PC3      PC4      PC5      PC6     PC7      PC8      PC9    PC10
## Standard deviation     13.55696 7.293266 6.131549 4.835588 4.270433 4.192822 3.90843 3.592951 3.390226 3.32974
## Proportion of Variance  0.36758 0.106380 0.075190 0.046770 0.036470 0.035160 0.03055 0.025820 0.022990 0.02217
## Cumulative Proportion   0.36758 0.473970 0.549160 0.595920 0.632400 0.667560 0.69811 0.723930 0.746910 0.76909
## Plotting
myDf <- cbind(pca1$x, dge.subset$samples)
f2p1 <- ggplot(myDf, aes(x=PC1, y=PC2, color=Visit, shape=Surgery.type)) +
  geom_point(size=4) +
  xlab(paste("PC1 (", round(summary(pca1)$importance[2,1],3)*100, "% variance)", sep="")) +
  ylab(paste("PC2 (", round(summary(pca1)$importance[2,2],3)*100, "% variance)", sep="")) +
  scale_color_manual(labels=c("Visit 1", "Visit 4"), values=myPalette[c(2,1)]) + ## v4 = red (up in DE analysis)
  scale_shape_manual(values=c(15:17)) + 
  myTheme +
  theme(legend.position="bottom")
## Figure 2A
f2p1 <- ggMarginal(f2p1, groupColour = TRUE, groupFill = TRUE)
plot(f2p1)

## Other interesting plots, not shown in the paper
## PC3 vs. 4 colored by sex
f <- ggplot(myDf, aes(x=PC3, y=PC4, color=Sex, shape=Visit)) +
  geom_point(size=4) +
  xlab(paste("PC3 (", round(summary(pca1)$importance[2,3],3)*100, "% variance)", sep="")) +
  ylab(paste("PC4 (", round(summary(pca1)$importance[2,4],3)*100, "% variance)", sep="")) +
  scale_color_manual(values=myPalette[3:4]) +
  scale_shape_manual(values=15:16) +
  myTheme +
  theme(legend.position="bottom")
f <- ggMarginal(f, groupColour = TRUE, groupFill = TRUE)
plot(f)

f <- ggplot(myDf[myDf$Patient.subgroup != "Unknown", ], 
            aes(x=PC7, y=PC9, color=Patient.subgroup, shape=Visit)) +
  geom_point(size=4) +
  xlab(paste("PC7 (", round(summary(pca1)$importance[2,7],3)*100, "% variance)", sep="")) +
  ylab(paste("PC9 (", round(summary(pca1)$importance[2,9],3)*100, "% variance)", sep="")) +
  scale_color_manual(name="Patient subgroup",
                     values=setNames(myPalette[c(9,5,6,7,8)], c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "Unknown"))) +
  scale_shape_manual(values=15:16) +
  myTheme  + 
  theme(legend.position="bottom") +
  guides(color=guide_legend(ncol=2))
f <- ggMarginal(f, groupColour = TRUE, groupFill = TRUE)
plot(f)

## Percentage of variance of each PC explained by different factors
## Put normalized data into a SingleCellExperiment object to be used in function getExplanatoryPCs
sce <- SingleCellExperiment(assays=list(logcounts=log2cpm.loess),
                                                  reducedDims = list(PCA=pca1$x),
                                                  colData=dge.subset$samples) 
percentVar <- getExplanatoryPCs(sce, 
                                n_dimred = 48,
                                dimred = "PCA",
                                variables = c("Visit", "Patient", "Sex", 
                                              "Patient.subgroup", "Surgery.type"))
head(percentVar, n=10)
##           Visit  Patient        Sex Patient.subgroup Surgery.type
## PC1  23.3007512 39.25754  1.2461776         6.597526    9.2467516
## PC2   0.3629822 65.29105 19.6800008         9.433735    8.3294502
## PC3   0.9991333 82.42652 23.2322872        29.273784    6.1749170
## PC4   4.4526713 82.20814 33.3452480         9.816160    1.7559173
## PC5   0.1148688 91.30424  9.0065640         3.770498   10.1595481
## PC6   0.3382346 85.61496  0.8068154        12.051550    2.1040895
## PC7   4.2925295 76.05769  5.3417397        17.572582    0.5431932
## PC8   0.6013340 78.83051  0.2964702        10.028197    9.0705173
## PC9   0.1353582 91.01690  1.3315236        19.424360    9.0489063
## PC10  0.1108903 95.52493  1.5439451        15.080473    1.8282309
## Plotting
myDf <- data.frame(PC = rep(seq_len(10), ncol(percentVar)), 
                   Factor = rep(factor(colnames(percentVar), levels = c("Patient", "Visit", "Sex", "Surgery.type", "Patient.subgroup")), each = 10), 
                   Pct_var_explained = as.numeric(percentVar[1:10, ]), 
                   check.names = FALSE)
f2p2 <- ggplot(myDf, aes(x = PC, y = Pct_var_explained, fill = Factor)) + 
  geom_bar(stat="identity", width = 0.7, position=position_dodge2(padding=0.05)) + 
  scale_fill_manual(values = myPalette[c(10, 1, 3, 13, 7)]) +
  xlab("Principal Component") +
  ylab("% variance explained") + 
  coord_cartesian(ylim = c(0, 100), expand = FALSE) + 
  scale_x_continuous(breaks = c(1:10)) +
  myTheme + 
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(), 
        legend.position="bottom") +
  guides(fill=guide_legend(ncol=3))  
## Figure 2B
f2p2

Differential expression analysis

## design matrix 
moma <- model.matrix(~ 0 + Visit + Patient, data=dge.subset$samples)
colnames(moma) <- gsub("Visit", "", colnames(moma)) 

## Contrast matrix
contrasts.matrix <- makeContrasts(
  Post_vs_pre = V4 - V1,
  levels=moma
)

v <- voom(dge.subset, moma, plot=TRUE, normalize.method = "cyclicloess")

fit <- lmFit(v, moma)
fit2 <- contrasts.fit(fit, contrasts.matrix)
fit2 <- eBayes(fit2, robust=TRUE) ## avoids shrinkage of large variances

## Genes DE at FDR 5%
table(de <- decideTests(fit2, p.value = 0.05))
## 
##    -1     0     1 
##   884 13011   463
## Top 10 genes by significance
topTable(fit2, coef=1, p.value=0.05, n=10, sort.by = "P")
##                          GENEID         SYMBOL                                DESCRIPTION    GENEBIOTYPE ENTREZID    UNIPROT LENGTH      logFC
## ENSG00000173585 ENSG00000173585           CCR9             C-C motif chemokine receptor 9 protein_coding    10803     B2R734   2807  1.1444969
## ENSG00000239571 ENSG00000239571      IGKV2D-30        immunoglobulin kappa variable 2D-30      IG_V_gene       NA       <NA>    395  2.0648001
## ENSG00000180537 ENSG00000180537         RNF182                    ring finger protein 182 protein_coding   221687 A0A024QZW5   4098 -1.7403616
## ENSG00000095932 ENSG00000095932         SMIM24         small integral membrane protein 24 protein_coding   284422     O75264   1481 -1.8153217
## ENSG00000132465 ENSG00000132465         JCHAIN    joining chain of multimeric IgA and IgM protein_coding     3512     P01591   1546  1.9230039
## ENSG00000163554 ENSG00000163554          SPTA1             spectrin alpha, erythrocytic 1 protein_coding     6708     P02549  10108 -1.3947816
## ENSG00000259040 ENSG00000259040 BLOC1S5-TXNDC5 BLOC1S5-TXNDC5 readthrough (NMD candidate) protein_coding       NA       <NA>   3030  0.8814465
## ENSG00000239264 ENSG00000239264         TXNDC5            thioredoxin domain containing 5 protein_coding    81567 A0A024QZV0   4361  0.8878952
## ENSG00000241294 ENSG00000241294       IGKV2-24         immunoglobulin kappa variable 2-24      IG_V_gene       NA       <NA>    390  1.9632546
## ENSG00000048462 ENSG00000048462       TNFRSF17         TNF receptor superfamily member 17 protein_coding      608     Q02223   1015  1.6891936
##                    AveExpr         t      P.Value   adj.P.Val        B
## ENSG00000173585  0.6221627  7.137526 9.635113e-08 0.001383410 7.163198
## ENSG00000239571  1.4115440  6.691342 3.054725e-07 0.001831252 6.552504
## ENSG00000180537 -1.0279598 -6.554139 4.374899e-07 0.001831252 6.100028
## ENSG00000095932  1.4076978 -6.495659 5.101692e-07 0.001831252 6.183886
## ENSG00000132465  5.9339808  6.117238 1.390180e-06 0.003992041 5.341951
## ENSG00000163554 -0.3731375 -5.952851 2.157171e-06 0.005162110 4.489506
## ENSG00000259040  5.8591142  5.773396 3.493046e-06 0.006232159 4.480450
## ENSG00000239264  5.8351068  5.748790 3.732339e-06 0.006232159 4.418202
## ENSG00000241294  1.8923204  5.696797 4.293790e-06 0.006232159 4.214713
## ENSG00000048462  0.6441979  5.692261 4.346637e-06 0.006232159 3.857395
## Top 10 genes by logFC
topTable(fit2, coef=1, p.value=0.05, n=10, sort.by = "logFC")
##                          GENEID     SYMBOL                             DESCRIPTION    GENEBIOTYPE ENTREZID    UNIPROT LENGTH     logFC    AveExpr
## ENSG00000239571 ENSG00000239571  IGKV2D-30     immunoglobulin kappa variable 2D-30      IG_V_gene       NA       <NA>    395  2.064800  1.4115440
## ENSG00000241294 ENSG00000241294   IGKV2-24      immunoglobulin kappa variable 2-24      IG_V_gene       NA       <NA>    390  1.963255  1.8923204
## ENSG00000275527 ENSG00000275527 AC100835.2                        novel transcript         lncRNA       NA       <NA>    479 -1.956837  0.3429435
## ENSG00000167992 ENSG00000167992       VWCE von Willebrand factor C and EGF domains protein_coding   220001     Q96DN2   4766 -1.925803  2.5032452
## ENSG00000132465 ENSG00000132465     JCHAIN joining chain of multimeric IgA and IgM protein_coding     3512     P01591   1546  1.923004  5.9339808
## ENSG00000095932 ENSG00000095932     SMIM24      small integral membrane protein 24 protein_coding   284422     O75264   1481 -1.815322  1.4076978
## ENSG00000243238 ENSG00000243238   IGKV2-30      immunoglobulin kappa variable 2-30      IG_V_gene       NA       <NA>    390  1.792634  3.0282818
## ENSG00000130300 ENSG00000130300      PLVAP  plasmalemma vesicle associated protein protein_coding    83483     Q9BX97   2295 -1.769098  1.2712457
## ENSG00000243264 ENSG00000243264  IGKV2D-29     immunoglobulin kappa variable 2D-29      IG_V_gene       NA       <NA>    400  1.742866  0.1837183
## ENSG00000180537 ENSG00000180537     RNF182                 ring finger protein 182 protein_coding   221687 A0A024QZW5   4098 -1.740362 -1.0279598
##                         t      P.Value   adj.P.Val           B
## ENSG00000239571  6.691342 3.054725e-07 0.001831252  6.55250410
## ENSG00000241294  5.696797 4.293790e-06 0.006232159  4.21471264
## ENSG00000275527 -5.639226 5.015494e-06 0.006232159  3.91406109
## ENSG00000167992 -5.468556 7.957993e-06 0.007928524  3.70020259
## ENSG00000132465  6.117238 1.390180e-06 0.003992041  5.34195144
## ENSG00000095932 -6.495659 5.101692e-07 0.001831252  6.18388602
## ENSG00000243238  5.359157 1.070610e-05 0.008055343  3.42466559
## ENSG00000130300 -5.407964 9.378410e-06 0.007928524  3.48492055
## ENSG00000243264  3.949592 4.872994e-04 0.022485176 -0.06752678
## ENSG00000180537 -6.554139 4.374899e-07 0.001831252  6.10002830
## Diagnostic plots
plotSA(fit2) 

top <- topTable(fit2, coef=1, p.value=1, n=Inf, sort.by = "none")
hist(top$P.Value, n=100)

Build up list of ligands and receptors

## CellChatDB ##################################################################
CellChatDB <- CellChatDB.human 
## Extract all ligands
cellchat_ligands <- CellChatDB$interaction |> 
  dplyr::filter(annotation == "Secreted Signaling") |> 
  dplyr::left_join(y = CellChatDB$geneInfo, by = join_by(ligand == Symbol)) |>
  dplyr::select(ligand) |> 
  distinct() |> 
  dplyr::filter(ligand != "") |>
  unlist()
## Add complexes
CellChatDB.complex <- CellChatDB$complex |> 
  as_tibble(rownames = "Complex") |> 
  pivot_longer(!Complex, names_to = "Subunit", values_to = "Symbol") |> 
  dplyr::filter(Symbol != "") |> 
  distinct()
cellchat_ligands <- c(cellchat_ligands,
                      CellChatDB$interaction |> 
                        dplyr::filter(annotation == "Secreted Signaling") |> 
                        dplyr::left_join(y = CellChatDB.complex, 
                                  by = join_by(ligand == Complex), 
                                  relationship = "many-to-many") |> 
                        dplyr::filter(!is.na(Symbol)) |> 
                        dplyr::left_join(y = CellChatDB$geneInfo, by = join_by(Symbol == Symbol)) |>
                        dplyr::select(Symbol) |> 
                        dplyr::filter(Symbol != "") |>
                        distinct() |> 
                        unlist()) |>
  unique() 

# Same for receptors
cellchat_receptors <- CellChatDB$interaction |> 
  dplyr::filter(annotation == "Secreted Signaling") |> 
  dplyr::left_join(y = CellChatDB$geneInfo, by = join_by(receptor == Symbol)) |>
  dplyr::select(receptor) |> 
  distinct() |> 
  dplyr::filter(receptor != "") |>
  unlist()
## Add complexes
cellchat_receptors <- c(cellchat_receptors,
                        CellChatDB$interaction |> 
                          dplyr::filter(annotation == "Secreted Signaling") |> 
                          dplyr::left_join(y = CellChatDB.complex, 
                                    by = join_by(receptor == Complex), 
                                    relationship = "many-to-many") |> 
                          dplyr::filter(!is.na(Symbol)) |> 
                          dplyr::left_join(y = CellChatDB$geneInfo, by = join_by(Symbol == Symbol)) |>
                          dplyr::select(Symbol) |> 
                          dplyr::filter(Symbol != "") |>
                          distinct() |> 
                          unlist()) |>
  unique() 


## CellTalkDB ##################################################################
CellTalkDB <- readRDS(url("https://github.com/ZJUFanLab/CellTalkDB/raw/f33e0e6dd3c143193d593b5dad8c613280b94699/database/human_lr_pair.rds"))

## Let's convert protein IDs to gene IDs using EnsemblDB
celltalk_ligands <- mapIds(edb, key=unique(CellTalkDB$ligand_ensembl_protein_id), keytype="PROTEINID", column="GENEID") 
## Warning: Unable to map 21 of 816 requested IDs.
celltalk_ligands <- celltalk_ligands[!is.na(celltalk_ligands)]

## Same for receptors
celltalk_receptors <- mapIds(edb, key=unique(CellTalkDB$receptor_ensembl_protein_id), keytype="PROTEINID", column="GENEID") 
## Warning: Unable to map 14 of 781 requested IDs.
celltalk_receptors <- celltalk_receptors[!is.na(celltalk_receptors)]


## CellPhoneDB #################################################################
CellPhoneDB <- read.csv(file = 'https://raw.githubusercontent.com/ventolab/cellphonedb-data/753d63da75dc11730edc0c4b01fbc9100584f77d/data/interaction_input.csv')
complex_input <- read.csv(file = 'https://raw.githubusercontent.com/ventolab/cellphonedb-data/753d63da75dc11730edc0c4b01fbc9100584f77d/data/complex_input.csv', row.names = 1)
geneInfo <- read.csv(file = 'https://raw.githubusercontent.com/ventolab/cellphonedb-data/753d63da75dc11730edc0c4b01fbc9100584f77d/data/gene_input.csv')

## process similar to CellChatDB
cellphone_ligands <- CellPhoneDB |> 
  dplyr::filter(directionality == "Ligand-Receptor") |> 
  dplyr::left_join(y = geneInfo, 
            by = join_by(partner_a == uniprot), 
            relationship = "many-to-many") |>
  dplyr::select(ensembl) |> 
  distinct() |> 
  dplyr::filter(ensembl != "") |>
  unlist() 
## Add complexes
cellphoneDB.complex <- complex_input |> 
  as_tibble(rownames = "Complex") |> 
  pivot_longer(cols = starts_with("uniprot_"), names_to = "Subunit", values_to = "uniprot") |> 
  dplyr::filter(uniprot != "") |> 
  distinct() |> 
  dplyr::select(Complex, uniprot)
cellphone_ligands <- c(cellphone_ligands,
                      CellPhoneDB |> 
                        dplyr::filter(directionality == "Ligand-Receptor") |> 
                        dplyr::left_join(y = cellphoneDB.complex, 
                                  by = join_by(partner_a == Complex),
                                  relationship = "many-to-many") |> 
                        dplyr::filter(!is.na(uniprot)) |> 
                        dplyr::left_join(y = geneInfo, 
                                  by = join_by(uniprot == uniprot), 
                                  relationship = "many-to-many") |>
                        dplyr::select(ensembl) |> 
                        dplyr::filter(ensembl != "") |>
                        distinct() |> 
                        unlist()) |>
  unique() 

## receptors
cellphone_receptors <- CellPhoneDB |> 
  dplyr::filter(directionality == "Ligand-Receptor") |> 
  dplyr::left_join(y = geneInfo, 
            by = join_by(partner_b == uniprot),
            relationship = "many-to-many") |>
  dplyr::select(ensembl) |> 
  distinct() |> 
  dplyr::filter(ensembl != "") |>
  unlist() 
## Add complexes
cellphone_receptors <- c(cellphone_receptors,
                      CellPhoneDB |> 
                        dplyr::filter(directionality == "Ligand-Receptor") |> 
                        dplyr::left_join(y = cellphoneDB.complex, 
                                  by = join_by(partner_b == Complex),
                                  relationship = "many-to-many") |> 
                        dplyr::filter(!is.na(uniprot)) |> 
                        dplyr::left_join(y = geneInfo, 
                                  by = join_by(uniprot == uniprot),
                                  relationship = "many-to-many") |>
                        dplyr::select(ensembl) |> 
                        dplyr::filter(ensembl != "") |>
                        distinct() |> 
                        unlist()) |>
  unique() 


## Merge 3 databases together, keeping only genes in dge.subset ################
all_ligands <- unique(c(dge.subset$genes$GENEID[dge.subset$genes$SYMBOL %in% cellchat_ligands], 
                        dge.subset$genes$GENEID[dge.subset$genes$GENEID %in% celltalk_ligands], 
                        dge.subset$genes$GENEID[dge.subset$genes$GENEID %in% cellphone_ligands]))
all_receptors <- unique(c(dge.subset$genes$GENEID[dge.subset$genes$SYMBOL %in% cellchat_receptors], 
                          dge.subset$genes$GENEID[dge.subset$genes$GENEID %in% celltalk_receptors], 
                          dge.subset$genes$GENEID[dge.subset$genes$GENEID %in% cellphone_receptors]))
## When overlap between ligand and receptor list, keep genes only as ligand
all_receptors <- all_receptors[!all_receptors %in% all_ligands]

## Save in case some resources become unavailable
# saveRDS(all_ligands, "data/ligands_combined.rds")
# saveRDS(all_receptors, "data/receptors_combined.rds")

Figures 3A and S2A

MA plots

# all_ligands <- readRDS("data/ligands_combined.rds")
# all_receptors <- readRDS("data/receptors_combined.rds")
myDf <- top
myDf$Significant <- FALSE
myDf$Significant[myDf$adj.P.Val <= 0.05 & myDf$logFC > 0] <- "Up"
myDf$Significant[myDf$adj.P.Val <= 0.05 & myDf$logFC < 0] <- "Down"
myDf$Significant <- factor(myDf$Significant, levels=c("Up", "Down", FALSE))
## Sort points so that significant points are plotted last
myDf <- myDf[order(myDf$Significant, decreasing = T), ]

f <- ggplot(myDf, aes(x=AveExpr, y=logFC)) + 
  ylab(bquote('log'[2]~'Fold-Change')) +
  xlab("Average expression") +
  geom_point(aes(color=Significant, alpha=Significant), shape=16) +
  scale_color_manual(name="Significant",  
                     limits=c("Up", "Down"), 
                     values=unname(c(myPalette[1], myPalette[2])),
                     na.value = "grey80") +
  scale_alpha_manual(values=c(1,1,0.6), guide="none") +
  myTheme + 
  theme(legend.position="bottom")

## Label ligands
myDf$ligand <- myDf$SYMBOL
myDf$ligand[myDf$adj.P.Val > 0.05 | !myDf$GENEID %in% all_ligands] <- ""
f3p1 <- f + geom_label_repel(data = myDf[myDf$ligand != "", ],
                   aes(x=AveExpr, y=logFC),
                   label=myDf[myDf$ligand != "", ]$ligand,
                   min.segment.length = unit(0, 'lines'), # always draw segment pointing to the gene
                   size=3,
                   color="black",
                   direction = "both",
                   point.padding = 0.1,
                   box.padding = 0.3,
                   max.overlaps = 25, 
                   seed=1234) 
f3p1
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider increasing max.overlaps

## Label receptors
myDf$receptor <- myDf$SYMBOL
myDf$receptor[myDf$adj.P.Val > 0.05 | !myDf$GENEID %in% all_receptors] <- ""
fs2p1 <- f + geom_label_repel(data = myDf[myDf$receptor != "", ],
                   aes(x=AveExpr, y=logFC),
                   label=myDf[myDf$receptor != "", ]$receptor,
                   min.segment.length = unit(0, 'lines'), 
                   size=3,
                   color="black",
                   direction = "both",
                   point.padding = 0.1,
                   box.padding = 0.3,
                   max.overlaps = 25, 
                   seed=1234) 
fs2p1

Figure S2B

Boxplots showing normalized expression of top DE genes

top <- topTable(fit2, coef=1, p.value=0.01, n=Inf, sort.by = "P")
# Remove 2 genes because they have identical sequences (same reads are counted)
top <- top[!top$SYMBOL %in% c("BLOC1S5-TXNDC5", # same sequence as TXNDC5
                              "IGKV2-30"), ]    # same sequence as IGKV2D-30

myDf <- log2cpm.loess[top$GENEID,] |>
  as_tibble(rownames = NA) |> 
  rownames_to_column() |>
  dplyr::rename(Gene = rowname) |>
  pivot_longer(cols= colnames(log2cpm.loess), 
               names_to = "Sample",
               values_to = "log2CPM") |> 
  dplyr::left_join(y=rownames_to_column(as_tibble(dge.subset$samples)), by = join_by("Sample")) |>
  dplyr::left_join(y=as_tibble(dge.subset$genes), by=join_by("Gene" == "GENEID"))
## Reorder subgroups
myDf$Patient.subgroup <- factor(myDf$Patient.subgroup, levels = c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "Unknown"))

## Order of genes: significance + separate up (first) and down-regulated genes (then)
myDf$SYMBOL <- factor(myDf$SYMBOL, levels=c(top[top$logFC > 0, ]$SYMBOL, top[top$logFC < 0, ]$SYMBOL))

## Same plot colored by subgroup (see CCR9 with higher expression in no_diabetes)
fs2p2 <- ggplot(myDf, aes(x=Visit, y=log2CPM, group=Patient, col=Patient.subgroup)) +
  facet_wrap( ~ SYMBOL, scales = "free_y", ncol = 5) +
  geom_line(position = position_dodge(0.2), alpha = .5) +
  geom_point(position = position_dodge(0.2), alpha = .8, size=2) +
  scale_colour_manual(name="Patient subgroup", values=setNames(myPalette[c(9,5,6,7,8)], c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "unknown"))) +
  ylab(bquote('log'[2]~'CPM')) +
  xlab("") +
  myTheme + 
  theme(strip.background = element_blank(), 
        strip.text.x = element_text(size = 14, vjust = 0, face = "bold"),
        strip.clip = "off",
        panel.spacing.x = unit(3, "points"), 
        panel.spacing.y = unit(0, "points"), 
        axis.text.y = element_text(size = 8), 
        legend.position="bottom")
fs2p2

Figure 3B

Transcription factor activity inference

We obtained the TF to target map from the CollecTRI collection. We used the decoupleR package version 2.9.7 (Bioconductor 3.18), the mapping with more recent versions might differ, so below we read directly from a saved object.

# net <- get_collectri(organism='human', split_complexes=FALSE)
# saveRDS(net, file="data/CollecTRI.rds") 
net <- readRDS(file="data/CollecTRI.rds") 

# We infer TF activities from the t-values of the DEGs between v4 and v1
top <- topTable(fit2, n=Inf, p=1, sort.by = "none")[!duplicated(dge.subset$genes$SYMBOL), ]
row.names(top) <- top$SYMBOL
contrast_acts <- run_ulm(mat=top[, 't', drop=FALSE], net=net, minsize = 10)

## Perform FDR correction and keep significant TFs
f_contrast_acts <- contrast_acts |> 
  mutate(FDR = p.adjust(contrast_acts$p_value, method="BH")) |> 
  dplyr::filter(FDR < 0.1) |> 
  arrange(score) 
f_contrast_acts
## # A tibble: 54 × 6
##    statistic source condition score  p_value      FDR
##    <chr>     <chr>  <chr>     <dbl>    <dbl>    <dbl>
##  1 ulm       SP1    t         -8.05 8.84e-16 3.72e-13
##  2 ulm       GATA1  t         -6.37 1.99e-10 4.18e- 8
##  3 ulm       ESR2   t         -4.77 1.84e- 6 2.58e- 4
##  4 ulm       GATA2  t         -4.37 1.23e- 5 1.04e- 3
##  5 ulm       SMAD5  t         -4.37 1.24e- 5 1.04e- 3
##  6 ulm       ESR1   t         -4.11 4.03e- 5 2.83e- 3
##  7 ulm       EGR1   t         -4.00 6.28e- 5 3.78e- 3
##  8 ulm       RUNX1  t         -3.93 8.54e- 5 4.49e- 3
##  9 ulm       SP3    t         -3.90 9.82e- 5 4.59e- 3
## 10 ulm       HIF1A  t         -3.81 1.42e- 4 5.68e- 3
## # ℹ 44 more rows
## Plot for paper
f3p2 <- ggplot(f_contrast_acts, aes(x = reorder(source, score), y = score)) + 
  geom_bar(aes(fill = score), stat = "identity") +
  xlab("") + 
  scale_fill_gradient2(low = myPalette[2], high = myPalette[1], mid = "white", midpoint = 0) + 
  myTheme + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major.x = element_blank(), 
        panel.grid.minor.x = element_blank()) 
f3p2

GSEA

Obtain Reactom and Wikipathways pathways

We download the pathways from the MSigDB version 2023.2.Hs database, available on the release archive

if(!file.exists("data/msigdb_v2023.2.Hs.db")){
  download.file("https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2023.2.Hs/msigdb_v2023.2.Hs.db.zip", destfile = "data/msigdb_v2023.2.Hs.db.zip")
  unzip("data/msigdb_v2023.2.Hs.db.zip", exdir = "data/")
}

## connect to db
con <- dbConnect(drv=SQLite(), dbname="data/msigdb_v2023.2.Hs.db")
temp <- dbGetQuery(conn=con, statement="SELECT gene_set.standard_name, gene_set.collection_name, gene_symbol.symbol
                                        FROM gene_set
                                        LEFT JOIN gene_set_gene_symbol
                                           ON gene_set.id = gene_set_gene_symbol.gene_set_id
                                        LEFT JOIN gene_symbol
                                           ON gene_symbol.id = gene_set_gene_symbol.gene_symbol_id
                                        WHERE gene_set.collection_name='C2:CP:REACTOME' OR gene_set.collection_name='C2:CP:WIKIPATHWAYS'")
table(temp$collection_name)
## 
##     C2:CP:REACTOME C2:CP:WIKIPATHWAYS 
##              95226              33947
## Let's turn this into structured lists, one per collection
collections <- sort(unique(temp$collection_name))
for (f in collections) {
  cat(f, "\n")
  myList <- tapply(temp[temp$collection_name %in% f, ], 
         factor(temp$standard_name[temp$collection_name %in% f]),
         function(x){
           return(dge.subset$genes$GENEID[dge.subset$genes$SYMBOL %in% unique(x$symbol)]) 
         })   
  myCollection <- paste0(make.names(f), ".CATEGORY.v2023.2.Hs.ensembl.hsa")
  assign(myCollection, myList)
}
## C2:CP:REACTOME 
## C2:CP:WIKIPATHWAYS
rm(temp)
dbDisconnect(con)

Obtain Gene Ontology gene sets

For the paper we obtained the GO mapping from the org.Hs.eg.db annotation package version 3.18. The mapping with other Bioconductor versions might differ, so below we read directly from a saved object in an .rds file.

## Perform the query
# updatedGO <- AnnotationDbi::select(org.Hs.eg.db, keys=dge.subset$genes$GENEID, keytype = "ENSEMBL", columns=c("GOALL", "EVIDENCEALL", "ONTOLOGYALL")) 
# updatedGO <- tapply(updatedGO$ENSEMBL, list(updatedGO$GOALL), unique)
# saveRDS(updatedGO, file="data/GO_mapping.rds") 

updatedGO <- readRDS(file="data/GO_mapping.rds") 

Testing with Roast

This performs a self-contained enrichment test. All results are stored into a list resRoast

# Used below for calculation of average logFC
top <- topTable(fit2, p.value = 1, n=Inf, sort="none")
geneList <- setNames(top$logFC, top$GENEID)

resRoast <- list()
for (f in c(paste0(make.names(collections), ".CATEGORY.v2023.2.Hs.ensembl.hsa"), "updatedGO")) {
  resRoast[[f]] <- list()

  ## process and filter gene sets
  indx <- ids2indices(gene.sets=get(f), identifiers=rownames(fit2$genes))
  indx <- indx[sapply(indx, length) >= 10]
  
  ## enrichment test
  resRoast[[f]] <- fry(log2cpm.loess, indx, moma, contrast = contrasts.matrix, sort = "directional")
  
  ## Add average logFC for every gene set
  resRoast[[f]]$aveLog2FC <- sapply(indx, function(x) mean(geneList[x], na.rm=T))[rownames(resRoast[[f]])]
}
# Add useful info for GO terms
resRoast$updatedGO <- cbind(resRoast$updatedGO, AnnotationDbi::select(GO.db, keys=row.names(resRoast$updatedGO), keytype = "GOID", columns=c("ONTOLOGY", "TERM", "DEFINITION")))
## 'select()' returned 1:1 mapping between keys and columns
# Results used in the paper:
resRoast$updatedGO |> dplyr::select(ONTOLOGY, TERM, DEFINITION, NGenes, Direction, aveLog2FC, FDR) |> head()
##            ONTOLOGY                                                 TERM
## GO:0046716       BP                     muscle cell cellular homeostasis
## GO:0098632       MF                 cell-cell adhesion mediator activity
## GO:0086065       BP    cell communication involved in cardiac conduction
## GO:0009110       BP                         vitamin biosynthetic process
## GO:0042440       BP                            pigment metabolic process
## GO:0031593       MF polyubiquitin modification-dependent protein binding
##                                                                                                                                                                                                                                                                                                                                      DEFINITION
## GO:0046716                                                                                                                                                                                                                            The cellular homeostatic process that preserves a muscle cell in a stable functional or structural state.
## GO:0098632                                                                                                                                                                                    The binding by a cell-adhesion protein on the cell surface to an extracellular matrix component, to mediate adhesion of the cell to another cell.
## GO:0086065 Any process that mediates interactions between a cell and its surroundings that contributes to the process of cardiac conduction. Encompasses interactions such as signaling or attachment between one cell and another cell, between a cell and an extracellular matrix, or between a cell and any other aspect of its environment.
## GO:0009110                                                                     The chemical reactions and pathways resulting in the formation of a vitamin, one of a number of unrelated organic substances that occur in many foods in small amounts and that are necessary in trace amounts for the normal metabolic functioning of the body.
## GO:0042440                                                                                                                                                                                                  The chemical reactions and pathways involving pigment, any general or particular coloring matter in living organisms, e.g. melanin.
## GO:0031593                                                                                                                                                                                                                                                                 Binding to a protein upon poly-ubiquitination of the target protein.
##            NGenes Direction   aveLog2FC        FDR
## GO:0046716     16      Down -0.11044773 0.03155728
## GO:0098632     26      Down -0.17308639 0.03155728
## GO:0086065     22      Down -0.18108434 0.03155728
## GO:0009110     17      Down -0.08775647 0.03155728
## GO:0042440     53      Down -0.15012531 0.03155728
## GO:0031593     54      Down -0.09560465 0.03181636
resRoast$C2.CP.REACTOME.CATEGORY.v2023.2.Hs.ensembl.hsa |> dplyr::select(NGenes, Direction, aveLog2FC, FDR) |> head()
##                                                                          NGenes Direction  aveLog2FC        FDR
## REACTOME_RHO_GTPASES_ACTIVATE_KTN1                                           10      Down -0.1693490 0.01320083
## REACTOME_HEME_BIOSYNTHESIS                                                   13      Down -0.3726414 0.01320083
## REACTOME_ADRENALINE_NORADRENALINE_INHIBITS_INSULIN_SECRETION                 13      Down -0.1630842 0.01626186
## REACTOME_THROMBIN_SIGNALLING_THROUGH_PROTEINASE_ACTIVATED_RECEPTORS_PARS     21      Down -0.1851271 0.01626186
## REACTOME_THROMBOXANE_SIGNALLING_THROUGH_TP_RECEPTOR                          16      Down -0.1838983 0.01626186
## REACTOME_ATF6_ATF6_ALPHA_ACTIVATES_CHAPERONE_GENES                           10        Up  0.1649199 0.01626186
resRoast$C2.CP.WIKIPATHWAYS.CATEGORY.v2023.2.Hs.ensembl.hsa |> dplyr::select(NGenes, Direction, aveLog2FC, FDR) |> head()
##                                                                                                      NGenes Direction   aveLog2FC        FDR
## WP_GLYCOLYSIS_IN_SENESCENCE                                                                              11      Down -0.17272946 0.03348947
## WP_NAD_METABOLISM_IN_ONCOGENE_INDUCED_SENESCENCE_AND_MITOCHONDRIAL_DYSFUNCTION_ASSOCIATED_SENESCENCE     18      Down -0.11546673 0.03348947
## WP_VITAMIN_B12_METABOLISM                                                                                27      Down -0.17558880 0.03348947
## WP_METABOLIC_REPROGRAMMING_IN_PANCREATIC_CANCER                                                          39      Down -0.10933796 0.03348947
## WP_HEMATOPOIETIC_STEM_CELL_DIFFERENTIATION                                                               39      Down -0.27127520 0.03348947
## WP_2Q13_COPY_NUMBER_VARIATION_SYNDROME                                                                   45      Down -0.06125964 0.03348947

Figures 4 and S4

Display the gene overlap between significant gene sets

## Function inspired from enrichplot:::emapplot.enrichResult
prepare_graph <- function(x, ## a roast/camera output: subset to the categories to plot
                          min_edge = 0.1, ## min fraction of overlap
                          pair_sim = NULL, # x@termsim from pairwise_termsim(), or a similarity matrix. row and columns should match row.names of x
                          NGenes = "NGenes", # column with set size info
                          colVar = "aveLog2FC", # column used for coloring
                          ...){
  if (!is.numeric(min_edge) | min_edge < 0 | min_edge > 1) {
    stop('"min_edge" should be a number between 0 and 1.')
  }
  y <- as.data.frame(x)
  y$ID <- row.names(y)
  
  ## Similarity matrix
  w <- as.matrix(pair_sim[row.names(y), row.names(y)])
  ## Pivot
  wd <- w |>
    as_tibble(rownames = "category1") |>
    pivot_longer(cols= colnames(w), 
                 names_to = "category2",
                 values_to = "similarity") |> 
    distinct() |>
    dplyr::filter(category1 != category2) |> 
    dplyr::filter(!is.na(similarity)) |>
    as.data.frame()
  
  ## Make graph
  g <- graph.data.frame(wd[, c("category1", "category2")], directed=FALSE)
  E(g)$width <- wd[, "similarity"] 
  ## Use similarity as the weight(length) of an edge
  E(g)$weight <- wd[, "similarity"]
  ## Delete when overlap is low
  g <- delete.edges(g, E(g)[wd[, "similarity"] < min_edge])
  
  ## Size of vertices
  V(g)$size <- y[V(g)$name, NGenes] 
  ## Value used for coloring of the vertices
  V(g)$color <- y[V(g)$name, colVar]
  
  return(g)
}

## Reactome results ############################################################
myResults <- resRoast$C2.CP.REACTOME.CATEGORY.v2023.2.Hs.ensembl.hsa
## Filter out gene sets that are very large, and keep the significant ones with largest effect size
myResults <- myResults[myResults$FDR < 0.05 & 
                         myResults$NGenes < 100 & 
                         abs(myResults$aveLog2FC) > 0.1, ] 
dim(myResults)
## [1] 69  7
## Build similarity matrix (Jaccard coefficient)
geneSets <- C2.CP.REACTOME.CATEGORY.v2023.2.Hs.ensembl.hsa[row.names(myResults)]
n <- length(geneSets)
w <- matrix(NA, nrow = n, ncol = n)
colnames(w) <- rownames(w) <- names(geneSets)
for (i in seq_len(n - 1)) {
  for (j in (i + 1):n) {
    w[i, j] <-  length(intersect(geneSets[[i]], geneSets[[j]]))/length(unique(c(geneSets[[i]], geneSets[[j]])))
  }
}
## process and simplify names
pair_sim <- w[row.names(myResults), row.names(myResults)]
row.names(myResults) <- gsub("^REACTOME_", "", row.names(myResults))
colnames(pair_sim) <- rownames(pair_sim) <- row.names(myResults)

g <- prepare_graph(myResults,
                   min_edge = 0.1, 
                   pair_sim = pair_sim,
                   NGenes = "NGenes", 
                   colVar = "aveLog2FC", 
                   layout = "nicely")
## Cut gene sets names that are too long
V(g)$name[nchar(V(g)$name) > 50] <- paste0(substr(V(g)$name[nchar(V(g)$name) > 50], 1, 47), "...")

set.seed(1234) ## because graph plotting function is not deterministic
f4 <- ggraph(g, layout="nicely") +
  geom_edge_link(aes(edge_width = width), 
                 alpha=1, colour = "darkgrey") + 
  geom_node_point(aes(size = size,
                      fill = color),
                  shape = 21, color="grey10") + 
  geom_node_text(aes(label=name),
                 size = 3, bg.color = "white", repel=TRUE) + 
  scale_size_continuous(name = "Number of genes", limits = c(10,100), range=c(1,10)) + 
  scale_edge_width_continuous(name = "Jaccard overlap", limits = c(0.1,1), range = c(0.1, 2)) +
  scale_fill_gradient2(name = "aveLog2FC", low = myPalette[2], high = myPalette[1]) + 
  coord_equal() + 
  theme_void() 
f4

## Wikipathways results ########################################################
myResults <- resRoast$C2.CP.WIKIPATHWAYS.CATEGORY.v2023.2.Hs.ensembl.hsa
myResults <- myResults[myResults$FDR < 0.05 & 
                         myResults$NGenes < 100 &
                         abs(myResults$aveLog2FC) > 0.1, ] 
dim(myResults)
## [1] 25  7
geneSets <- C2.CP.WIKIPATHWAYS.CATEGORY.v2023.2.Hs.ensembl.hsa[row.names(myResults)]
n <- length(geneSets)
w <- matrix(NA, nrow = n, ncol = n)
colnames(w) <- rownames(w) <- names(geneSets)
for (i in seq_len(n - 1)) {
  for (j in (i + 1):n) {
    w[i, j] <-  length(intersect(geneSets[[i]], geneSets[[j]]))/length(unique(c(geneSets[[i]], geneSets[[j]])))
  }
}
pair_sim <- w[row.names(myResults), row.names(myResults)]
row.names(myResults) <- gsub("^WP_", "", row.names(myResults))
colnames(pair_sim) <- rownames(pair_sim) <- row.names(myResults)

g <- prepare_graph(myResults,
                   min_edge = 0.1, 
                   pair_sim = pair_sim,
                   NGenes = "NGenes", 
                   colVar = "aveLog2FC", 
                   layout = "nicely")
V(g)$name[nchar(V(g)$name) > 50] <- paste0(substr(V(g)$name[nchar(V(g)$name) > 50], 1, 47), "...")

set.seed(1234) ## because graph plotting function is not deterministic
fs4p1 <- ggraph(g, layout="nicely") +
  geom_edge_link(aes(edge_width = width), 
                 alpha=1, colour = "darkgrey") + 
  geom_node_point(aes(size = size,
                      fill = color),
                  shape = 21, color="grey10") + 
  geom_node_text(aes(label=name),
                 size = 3, bg.color = "white", repel=TRUE) + 
  scale_size_continuous(name = "Number of genes", limits = c(10,100), range=c(1,10)) + 
  scale_edge_width_continuous(name = "Jaccard overlap", limits = c(0.1,1), range = c(0.1, 2)) +
  scale_fill_gradient2(name = "aveLog2FC", low = myPalette[2], high = myPalette[1]) + 
  coord_equal() + 
  theme_void() 
fs4p1

## Gene Ontology results #######################################################
myResults <- resRoast$updatedGO
myResults <- myResults[myResults$FDR < 0.05 & 
                         myResults$NGenes < 100 & 
                         abs(myResults$aveLog2FC) > 0.1, ] 
dim(myResults)
## [1] 129  11
geneSets <- updatedGO[row.names(myResults)]
n <- length(geneSets)
w <- matrix(NA, nrow = n, ncol = n)
colnames(w) <- rownames(w) <- names(geneSets)
for (i in seq_len(n - 1)) {
  for (j in (i + 1):n) {
    w[i, j] <-  length(intersect(geneSets[[i]], geneSets[[j]]))/length(unique(c(geneSets[[i]], geneSets[[j]])))
  }
}
pair_sim <- w[row.names(myResults), row.names(myResults)]
row.names(myResults) <- str_to_title(myResults$TERM)
colnames(pair_sim) <- rownames(pair_sim) <- row.names(myResults)

g <- prepare_graph(myResults,
                   min_edge = 0.1, 
                   pair_sim = pair_sim,
                   NGenes = "NGenes", 
                   colVar = "aveLog2FC", 
                   layout = "nicely")
V(g)$name[nchar(V(g)$name) > 50] <- paste0(substr(V(g)$name[nchar(V(g)$name) > 50], 1, 47), "...")

set.seed(1234) ## because graph plotting function is not deterministic
fs4p2 <- ggraph(g, layout="nicely") +
  geom_edge_link(aes(edge_width = width), 
                 alpha=1, colour = "darkgrey") + 
  geom_node_point(aes(size = size,
                      fill = color),
                  shape = 21, color="grey10") + 
  geom_node_text(aes(label=name),
                 size = 3, bg.color = "white", repel=TRUE) + 
  scale_size_continuous(name = "Number of genes", limits = c(10,100), range=c(1,10)) + 
  scale_edge_width_continuous(name = "Jaccard overlap", limits = c(0.1,1), range = c(0.1, 2)) +
  scale_fill_gradient2(name = "aveLog2FC", low = myPalette[2], high = myPalette[1]) + 
  coord_equal() + 
  theme_void() 
fs4p2

Figure 5

Inference of cell type abundances with xCell, available here: https://github.com/dviraran/xCell

# Normalizing to gene length is required. We use the total length of genomic regions used for counting by FeatureCounts to calculate RPKMs
exprMatrix <- log2cpm.loess - log2( dge.subset$genes$LENGTH / 1000 )
row.names(exprMatrix) <- dge.subset$genes$SYMBOL
## Remove duplicated symbols
exprMatrix <- exprMatrix[ !duplicated(row.names(exprMatrix)), ]

## Filtering of cell types: 
# - we do not run on cell types which are not expected in blood 
# - we excluded as a 2nd step the cell types which gave enrichment scores ~ 0 in almost all samples (Tgd cells, CD4+ naive T-cells and NK cells)        
set.seed(1234) 
xCellresults <- xCellAnalysis(exprMatrix, cell.types.use = c("B-cells", "naive B-cells",  "Memory B-cells", 
                                                             "Class-switched memory B-cells","Plasma cells", 
                                                             "CD4+ T-cells", "CD4+ memory T-cells", "CD4+ Tcm", "CD4+ Tem", 
                                                             "CD8+ T-cells", "CD8+ naive T-cells", "CD8+ Tcm", "CD8+ Tem",
                                                             "Th2 cells", "Th1 cells", "Tregs", 
                                                             "NKT", 
                                                             "Monocytes", "Macrophages", "Macrophages M1", "Macrophages M2", 
                                                             "DC", "aDC", "cDC", "iDC", "pDC", 
                                                             "Neutrophils", 
                                                             "Erythrocytes", "Platelets", "Megakaryocytes", 
                                                             "Basophils", "Eosinophils", "Mast cells"))

## Significance testing with limma
all(colnames(dge.subset) == colnames(xCellresults))
fit.ct <- lmFit(xCellresults, moma) 
fit2.ct <- contrasts.fit(fit.ct, contrasts.matrix)
fit2.ct <- eBayes(fit2.ct, trend=TRUE, robust=TRUE)
## Only 3 cell types significant at FDR=5%. Use a more lenient FDR cutoff of 20%
table(de <- decideTests(fit2.ct, p.value = 0.2))
top <- topTable(fit2.ct, coef=1, p.value=0.2, n=10, sort.by = "P")
top

## Plot the significant cell types 
myDf <- xCellresults[row.names(xCellresults) %in% row.names(top), ] |>
  as_tibble(rownames = NA) |> 
  rownames_to_column() |>
  dplyr::rename("Cell Type" = rowname) |>
  pivot_longer(cols= colnames(xCellresults), 
               names_to = "Sample",
               values_to = "Score") |> 
  dplyr::left_join(y=as_tibble(dge.subset$samples), by = join_by("Sample")) 
## Reorder levels
myDf$Patient.subgroup <- factor(myDf$Patient.subgroup, levels = c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "Unknown"))
## Reorder of cell types according to direction and significance
myDf$`Cell Type` <- factor(myDf$`Cell Type`, levels=c(row.names(top)[top$logFC > 0], row.names(top)[top$logFC < 0]))

f5 <- ggplot(myDf, aes(x=Visit, y=Score, group=Patient, col=Patient.subgroup)) +
  facet_wrap( ~ `Cell Type`, scales = "free_y", ncol = 5) +
  geom_line(position = position_dodge(0.2), alpha = .5) +
  geom_point(position = position_dodge(0.2), alpha = .8, size=2) +
  scale_colour_manual(name="Patient subgroup", values=setNames(myPalette[c(9,5,6,7,8)], c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "unknown"))) +
  scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, .05))) +
  ylab(bquote('Enrichment score')) +
  xlab("") +
  myTheme + 
  theme(strip.background = element_blank(), 
        strip.text.x = element_text(size = 14, vjust = 0, face = "bold"),
        strip.clip = "off",
        panel.spacing.x = unit(3, "points"), 
        panel.spacing.y = unit(0, "points"), 
        axis.text.y = element_text(size = 8), 
        legend.position="bottom")
f5

Figure 6

Homuth et al.

https://bmcmedgenomics.biomedcentral.com/articles/10.1186/s12920-015-0141-x

## Download Additional file 2 from paper
if(!file.exists("data/Homuth.xlsx")){
  download.file("https://static-content.springer.com/esm/art%3A10.1186%2Fs12920-015-0141-x/MediaObjects/12920_2015_141_MOESM2_ESM.xlsx", destfile = "data/Homuth.xlsx")
}
## Supplementary Table 34:  Results of sensitivity analyses in KORA F4 "BASIC MODEL: Adjustment for age, sex, technical covariates, and blood cell counts"
tab <- readxl::read_excel("data/Homuth.xlsx", sheet = 36, skip = 1)
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `n total` -> `n total...7`
## • `Effect` -> `Effect...8`
## • `SE` -> `SE...9`
## • `p-value` -> `p-value...10`
## • `p-value (BH)` -> `p-value (BH)...11`
## • `n total` -> `n total...12`
## • `Effect` -> `Effect...13`
## • `SE` -> `SE...14`
## • `p-value` -> `p-value...15`
## • `p-value (BH)` -> `p-value (BH)...16`
## • `n total` -> `n total...17`
## • `Effect` -> `Effect...18`
## • `SE` -> `SE...19`
## • `p-value` -> `p-value...20`
## • `p-value (BH)` -> `p-value (BH)...21`
## Refresh illumina array annotation
tab$GENEID <- AnnotationDbi::mapIds(illuminaHumanv3.db, keys = tab$`...2`, keytype = "PROBEID", column = "ENSEMBL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
## Average effect size of probes matching to same Ensembl gene
tab <- tab[!is.na(tab$GENEID), ]
tab <- avereps(tab$`Effect...8`, tab$GENEID)
colnames(tab)[1] <- "effect_size"

## Load our data
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
## Merge
myDf <- merge(top.x, tab, by.x="GENEID", by.y=0) 

## Total least squares regression 
v <- prcomp(myDf[, c("logFC", "effect_size")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]

## Labeling specific categories of genes: we use the clusters of proteins (encoded by significantly down-regulated genes) from the STRINGdb analysis (Figure S3)
stringClusters <- read.table("data/string_MCL_clusters.tsv", h=T, sep="\t", quote="", comment.char = "")
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin" 
myDf$highlight[grep("^MT-", myDf$SYMBOL)] <- "Mitochondrial" 
table(myDf$highlight)
## 
## Erythrocytes   Hemoglobin  Neutrophils         NFKB 
##           13           12           12           15
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ], 
              myDf[!is.na(myDf$highlight), ])

## Define colors to be used
myColorNeutro  <- myPalette[3]
myColorMito    <- myPalette[4]
myColorHemo    <- myPalette[5]
myColorErythro <- myPalette[6]
myColorNfkb    <- myPalette[7]

f6p1 <- ggplot(myDf, aes(x=logFC, y=effect_size)) + 
  ylab(bquote('Effect size')) +
  xlab(bquote('log'[2]~'Fold-Change')) +
  geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
  geom_density_2d(col="grey30") +
  
  ## Correlation coefficient
  stat_cor(label.x = 2.12, label.y = 0.077, aes(label = after_stat(r.label)), size=7, hjust=1) + 
  ## Regression line slope and intercept 
  geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +

  ## Fix limits 
  xlim(c(-2.12, 2.12)) +
  ylim(c(-0.077, 0.077)) +
  scale_color_manual(name="",
    limits=sort(unique(myDf$highlight)),
    values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
    na.value = "grey80") +
  scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
  scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
  annotate(geom = "text", x=-2.12, y=0, label="Effect of BMI (Homuth et al.)", angle = 90, size=5, fontface = 'italic') +
  annotate(geom = "text", x=0, y=-0.077, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
  myTheme
f6p1

Kalafati et al.

https://genesandnutrition.biomedcentral.com/articles/10.1186/s12263-021-00702-7

## Load DE analysis file obtained from the authors upon request
tab <- read.table("data/Kalafati.txt", sep="\t", h=T, row.names = 1)

## Load our data
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
## Merge
myDf <- merge(top.x, tab, by.x="GENEID", by.y=0) 

## One lncRNA behaves like an outlier, with log2FC of 18.4, let's filter it out
myDf <- myDf[myDf$GENEID != "ENSG00000250302", ]

## Total least squares regression 
v <- prcomp(myDf[, c("logFC", "log2FoldChange")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]

## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin" 
myDf$highlight[grep("^MT-", myDf$SYMBOL)] <- "Mitochondrial" 
table(myDf$highlight)
## 
## Erythrocytes   Hemoglobin  Neutrophils         NFKB 
##           12           12           13           14
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ], 
              myDf[!is.na(myDf$highlight), ])

f6p2 <- ggplot(myDf, aes(x=logFC, y=log2FoldChange)) + 
  ylab(bquote('log'[2]~'Fold-Change')) +
  xlab(bquote('log'[2]~'Fold-Change')) +
  geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
  geom_density_2d(col="grey30") +
  
  ## Correlation coefficient
  stat_cor(label.x = 2.12, label.y = 3, aes(label = after_stat(r.label)), size=7, hjust=1) + 
  ## Regression line slope and intercept 
  geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +

  ## Fix limits 
  xlim(c(-2.12, 2.12)) +
  ylim(c(-3, 3)) +
  scale_color_manual(name="",
    limits=sort(unique(myDf$highlight)),
    values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
    na.value = "grey80") +
  scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
  scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
  annotate(geom = "text", x=-2.12, y=0, label="Insulin-resistant vs. sensitive (Kalafati et al.)", angle = 90, size=5, fontface = 'italic') +
  annotate(geom = "text", x=0, y=-3, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
  myTheme 
f6p2
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_density2d()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_cor()`).
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).

Wesolowska-Andersen et al.

https://www.sciencedirect.com/science/article/pii/S2666379121003499

## Download Table S5F from paper
if(!file.exists("data/Wesolowska-Andersen.xlsx")){
  download.file("https://ars.els-cdn.com/content/image/1-s2.0-S2666379121003499-mmc6.xlsx", destfile = "data/Wesolowska-Andersen.xlsx")
}
## Focus on archetype B
tab <- readxl::read_excel("data/Wesolowska-Andersen.xlsx", sheet = 6, skip = 2)
## Process the gene IDs
tab$GENEID <- gsub("\\.\\d+$", "", tab$Transcript)

## Load our data
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
## Merge
myDf <- merge(top.x, tab, by="GENEID") 

## Total least squares regression 
v <- prcomp(myDf[, c("logFC", "B: estimate")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]

## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin" 
myDf$highlight[grep("^MT-", myDf$SYMBOL)] <- "Mitochondrial" 
table(myDf$highlight)
## 
## Erythrocytes   Hemoglobin  Neutrophils         NFKB 
##           13           11           12           14
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ], 
              myDf[!is.na(myDf$highlight), ])

f6p3 <- ggplot(myDf, aes(x=logFC, y=`B: estimate`)) + 
  ylab(bquote('Effect size')) +
  xlab(bquote('log'[2]~'Fold-Change')) +
  geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
  geom_density_2d(col="grey30") +
  
  ## Correlation coefficient
  stat_cor(label.x = 2.12, label.y = 0.04, aes(label = after_stat(r.label)), size=7, hjust=1) + 
  ## Regression line slope and intercept 
  geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +

  ## Fix limits 
  xlim(c(-2.12, 2.12)) +
  ylim(c(-0.04, 0.04)) + 
  scale_color_manual(name="",
    limits=sort(unique(myDf$highlight)),
    values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
    na.value = "grey80") +
  scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
  scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
  annotate(geom = "text", x=-2.12, y=0, label="T2D archetype B (Wesolowska-Andersen et al.)", angle = 90, size=5, fontface = 'italic') +
  annotate(geom = "text", x=0, y=-0.04, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
  myTheme 
f6p3

de Klerk et al.

https://link.springer.com/article/10.1007/s00125-023-05886-8

## Download Supp Table of paper
if(!file.exists("data/deKlerk.xlsx")){
  download.file("https://static-content.springer.com/esm/art%3A10.1007%2Fs00125-023-05886-8/MediaObjects/125_2023_5886_MOESM2_ESM.xlsx", destfile = "data/deKlerk.xlsx")
}
## Genes DE in MOD T2D cluster
tab <- readxl::read_excel(path="data/deKlerk.xlsx", sheet = 6, range = "O2:T7931")

## Load our data
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
## Merge
myDf <- merge(top.x, tab, by.x="SYMBOL", by.y="Gene name") 

## Total least squares regression 
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]

## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes" 
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin" 
myDf$highlight[grep("^MT-", myDf$SYMBOL)] <- "Mitochondrial" 
table(myDf$highlight)
## 
##  Erythrocytes    Hemoglobin Mitochondrial   Neutrophils          NFKB 
##             7             3             6             7             9
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ], 
              myDf[!is.na(myDf$highlight), ])

f6p4 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) + 
  ylab(bquote('log'[2]~'Fold-Change')) +
  xlab(bquote('log'[2]~'Fold-Change')) +
  geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
  geom_density_2d(col="grey30") +
  
  ## Correlation coefficient
  stat_cor(label.x = 2.12, label.y = 1, aes(label = after_stat(r.label)), size=7, hjust=1) + 
  ## Regression line slope and intercept 
  geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +

  ## Fix limits 
  xlim(c(-2.12, 2.12)) +
  ylim(c(-1, 1)) +
  scale_color_manual(name="",
    limits=sort(unique(myDf$highlight)),
    values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
    na.value = "grey80") +
  scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
  scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
  annotate(geom = "text", x=-2.12, y=0, label="Mild obesity-related T2D (de Klerk et al.)", angle = 90, size=5, fontface = 'italic') +
  annotate(geom = "text", x=0, y=-1, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
  myTheme 
f6p4

Figure 7

Liu et al.

https://www.frontiersin.org/journals/endocrinology/articles/10.3389/fendo.2023.1049484/full

The dataset was downloaded from SRA (BioProject ID PRJNA861382) and reprocessed similar to our dataset (STAR + featureCounts).

dge.liu <- readRDS("data/Liu_DGEList.rds")
dge.liu$samples
##             group lib.size norm.factors      Donor    sex
## SRR20647845  post 22207976    0.9380378  Patient_8 Female
## SRR20647846  post 21164241    0.8767257  Patient_7 Female
## SRR20647854   pre 22508471    0.8465755  Patient_1 Female
## SRR20647836   pre 18414447    1.0485288  Patient_9 Female
## SRR20647841   pre 21166742    1.0296887  Patient_4 Female
## SRR20647847  post 18265668    1.0861937  Patient_6 Female
## SRR20647849  post 19149312    1.0266193  Patient_4 Female
## SRR20647843  post 20721058    0.9899505 Patient_10 Female
## SRR20647835   pre 19499648    1.0387250 Patient_10 Female
## SRR20647838   pre 22499441    0.9389552  Patient_7 Female
## SRR20647842   pre 23128101    1.0125301  Patient_3   Male
## SRR20647844  post 17560932    1.0830938  Patient_9 Female
## SRR20647851  post 15150392    0.9724933  Patient_2   Male
## SRR20647850  post 18867523    0.9853841  Patient_3   Male
## SRR20647837   pre 20081873    0.8965124  Patient_8 Female
## SRR20647839   pre 22766645    1.0812238  Patient_6 Female
## SRR20647840   pre 22421969    1.0192733  Patient_5 Female
## SRR20647848  post 18032982    1.0404871  Patient_5 Female
## SRR20647852  post 16732593    1.0517515  Patient_1 Female
## SRR20647853   pre 19830022    1.0874513  Patient_2   Male
## Filter lowly expressed genes
keep <- rowSums(edgeR::cpm(dge.liu) > 1) >= 3
## Exclude LRG loci
keep <- keep & !dge.liu$genes$GENEBIOTYPE %in% c("LRG_gene")
sum(keep) 
## [1] 18648
dge.liu <- dge.liu[keep, , keep.lib.sizes=FALSE] 
dge.liu <- calcNormFactors(dge.liu) 

## design matrix 
moma.liu <- model.matrix(~ 0 + group + Donor, data=dge.liu$samples)
colnames(moma.liu) <- gsub("group", "", colnames(moma.liu)) 

## Contrast matrix
contrasts.matrix.liu <- makeContrasts(
  Post_vs_pre = post - pre,
  levels=moma.liu
)

v <- voom(dge.liu, moma.liu, plot=TRUE, normalize.method = "cyclicloess")

fit <- lmFit(v, moma.liu)
fit2.liu <- contrasts.fit(fit, contrasts.matrix.liu)
fit2.liu <- eBayes(fit2.liu, robust=TRUE) 
table(de <- decideTests(fit2.liu, p.value = 0.05))
## 
##    -1     0     1 
##   170 18187   291
topTable(fit2.liu, coef=1, p.value=0.05, n=10, sort.by = "P")
##                    source  SYMBOL                                                     DESCRIPTION                      GENEBIOTYPE ENTREZID
## ENSG00000165997 ensdb_110   ARL5B                          ADP ribosylation factor like GTPase 5B                   protein_coding   221079
## ENSG00000040633 ensdb_110   PHF23                                           PHD finger protein 23                   protein_coding    79142
## ENSG00000261026 ensdb_110                                   novel transcript, overlapping to EGR3                           lncRNA     <NA>
## ENSG00000130600 ensdb_110     H19                   H19 imprinted maternally expressed transcript                           lncRNA   283120
## ENSG00000118503 ensdb_110 TNFAIP3                                     TNF alpha induced protein 3                   protein_coding     7128
## ENSG00000019186 ensdb_110 CYP24A1                  cytochrome P450 family 24 subfamily A member 1                   protein_coding     1591
## ENSG00000279192 ensdb_110   PWAR5                              Prader Willi/Angelman region RNA 5                              TEC     <NA>
## ENSG00000115009 ensdb_110   CCL20                                   C-C motif chemokine ligand 20                   protein_coding     6364
## ENSG00000249138 ensdb_110   SLED1 proteoglycan 3, pro eosinophil major basic protein 2 pseudogene transcribed_processed_pseudogene   643036
## ENSG00000171940 ensdb_110  ZNF217                                         zinc finger protein 217                   protein_coding     7764
##                    UNIPROT LENGTH          GENEID     METABIOTYPE      logFC    AveExpr          t      P.Value    adj.P.Val        B
## ENSG00000165997     B0YIW9  22209 ENSG00000165997  protein_coding  1.6671176  5.9175446  11.541252 4.712802e-08 0.0008788433 9.005843
## ENSG00000040633     Q9BUL5   4694 ENSG00000040633  protein_coding -0.6102185  4.5412871 -10.575801 1.283630e-07 0.0011968568 8.045979
## ENSG00000261026       <NA>   4997 ENSG00000261026 long_non_coding  3.8880188 -0.2657712   9.984821 2.460841e-07 0.0015296585 4.602253
## ENSG00000130600       <NA>   9387 ENSG00000130600 long_non_coding  5.3886969 -1.0689809   9.708224 4.621405e-07 0.0018464388 6.131134
## ENSG00000118503     Q5VXR0  16101 ENSG00000118503  protein_coding  2.4512651  9.2667332   9.645060 4.959199e-07 0.0018464388 6.771843
## ENSG00000019186     Q07973  20541 ENSG00000019186  protein_coding -4.1040213 -1.6357374  -9.226407 5.940923e-07 0.0018464388 3.924560
## ENSG00000279192       <NA>   3180 ENSG00000279192            <NA>  0.7557717  3.1271397   9.042741 7.416132e-07 0.0019756576 6.285701
## ENSG00000115009 A0A2R8Y806  11826 ENSG00000115009  protein_coding  4.0117225  1.1834510   8.716809 1.461069e-06 0.0027041740 4.929147
## ENSG00000249138       <NA>    751 ENSG00000249138      pseudogene  3.6836561 -0.4619774   8.484649 1.486413e-06 0.0027041740 3.654499
## ENSG00000171940     O75362  42837 ENSG00000171940  protein_coding -0.5928820  6.9140751  -8.462452 1.529141e-06 0.0027041740 5.638985
## Comparison of fold changes ##################################################
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
top.y <- topTable(fit2.liu, coef="Post_vs_pre", sort.by = "none", n=Inf)
myDf <- merge(top.x, top.y, by="GENEID") 

## Total least squares regression 
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
eq <- as.expression(substitute(italic(y) == b %.% italic(x) + a, 
                               list(a = format(unname(int), digits = 2),
                                    b = format(unname(slp), digits = 2))))

## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin" 
myDf$highlight[grep("^MT-", myDf$SYMBOL.x)] <- "Mitochondrial" 
table(myDf$highlight)
## 
##  Erythrocytes    Hemoglobin Mitochondrial   Neutrophils          NFKB 
##            11             4            34            12            15
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ], 
              myDf[!is.na(myDf$highlight), ])

f7p1 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) + 
  ylab(bquote('log'[2]~'Fold-Change')) +
  xlab(bquote('log'[2]~'Fold-Change')) +
  geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
  geom_density_2d(col="grey30") +
  
  ## Correlation coefficient
  stat_cor(label.x = -3.65, label.y = 3.65, aes(label = after_stat(r.label)), size=7, hjust=0) + 
  ## Regression line slope and intercept 
  geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
  annotate(geom="text", x=-3.65, y=3.65-0.53, 
           label=eq, parse = TRUE, # See above
           size=7, hjust=0, vjust=0) +

  ## Fix limits so that range is equal on x and y axes
  xlim(c(-3.65, 3.65)) +
  ylim(c(-3.65, 3.65)) +
  scale_color_manual(name="",
    limits=sort(unique(myDf$highlight)),
    values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
    na.value = "grey80") +
  scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
  scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
  annotate(geom = "text", x=-3.65, y=0, label="1 month post vs. pre-surgery (Liu et al.)", angle = 90, size=5, fontface = 'italic') +
  annotate(geom = "text", x=0, y=-3.65, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
  myTheme 
f7p1

Rashid et al.

https://academic.oup.com/jes/article/8/1/bvad159/7484625

## Raw .CEL files are available from GEO under accession GSE271700
if(!dir.exists("data/GSE271700_RAW/")){
  download.file("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE271nnn/GSE271700/suppl/GSE271700_RAW.tar", destfile = "data/GSE271700_RAW.tar")
  untar("data/GSE271700_RAW.tar", exdir = "data/GSE271700_RAW/")
  file.remove("data/GSE271700_RAW.tar")
}
## Read-in and MA normalization
eset <- justRMA(celfile.path="data/GSE271700_RAW/")
## 
fData(eset)$GENEID <- AnnotationDbi::mapIds(hgu133plus2.db, keys = row.names(eset), keytype = "PROBEID", column = "ENSEMBL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
fData(eset)$SYMBOL <- AnnotationDbi::mapIds(hgu133plus2.db, keys = row.names(eset), keytype = "PROBEID", column = "SYMBOL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
## Filter out probes without an Ensembl ID
eset <- eset[!is.na(fData(eset)$GENEID)] 
## Average expression of probes matching to same Ensembl gene
eset <- ExpressionSet(assayData = avereps(eset, ID=fData(eset)$GENEID), 
                      phenoData = phenoData(eset))
fData(eset)$GENEID <- row.names(eset)
fData(eset)$SYMBOL <- mapIds(edb, key=row.names(eset), keytype="GENEID", column="SYMBOL")
fData(eset)$GENENAME <- mapIds(edb, key=row.names(eset), keytype="GENEID", column="DESCRIPTION")
## Sample info
pData(eset)$visit <- factor(strsplit2(colnames(eset), split = "_")[, 2])
pData(eset)$Donor <- factor(strsplit2(colnames(eset), split = "_")[, 3])

## normalization
exprs(eset) <- normalizeBetweenArrays(exprs(eset), method = "cyclicloess")
plotDensities(exprs(eset), group = pData(eset)$visit, legend="right")

## DE analysis #################################################################
## Pre vs. after 2 months
eset.subset <- eset[, pData(eset)$visit %in% c("T0", "T1")]
## Remove unpaired donors
eset.subset <- eset.subset[, pData(eset.subset)$Donor %in% names(which(table(pData(eset.subset)$Donor) == 2))]
dim(eset.subset) 
## Features  Samples 
##    20202       46
pData(eset.subset)$Donor <- factor(pData(eset.subset)$Donor)
pData(eset.subset)$visit <- factor(pData(eset.subset)$visit)

## design matrix: for now ignore subgroup
moma.rashid <- model.matrix(~ 0 + visit + Donor, data=pData(eset.subset))
colnames(moma.rashid) <- gsub("visit", "", colnames(moma.rashid)) 

## Contrast matrix
contrasts.matrix.rashid <- makeContrasts(
  Post_vs_pre = T1 - T0,
  levels=moma.rashid
)
fit <- lmFit(eset.subset, moma.rashid)
keep <- fit$Amean > 4
fit2.rashid.2months <- contrasts.fit(fit[keep, ], contrasts.matrix.rashid)
fit2.rashid.2months <- eBayes(fit2.rashid.2months, trend = TRUE, robust=TRUE) 
table(de <- decideTests(fit2.rashid.2months, p.value = 0.05))
## 
##     0 
## 15519
# Top genes
topTable(fit2.rashid.2months, coef=1, p.value=1, n=10, sort.by = "P")
##                          GENEID     SYMBOL                                                                                     GENENAME      logFC
## ENSG00000206047 ENSG00000206047      DEFA1                                          defensin alpha 1 [Source:HGNC Symbol;Acc:HGNC:2761] -1.6568855
## ENSG00000121634 ENSG00000121634       GJA8                              gap junction protein alpha 8 [Source:HGNC Symbol;Acc:HGNC:4281] -0.3013184
## ENSG00000152292 ENSG00000152292      SH2D6                                  SH2 domain containing 6 [Source:HGNC Symbol;Acc:HGNC:30439] -0.3743990
## ENSG00000126790 ENSG00000126790    L3HYPDH                     trans-L-3-hydroxyproline dehydratase [Source:HGNC Symbol;Acc:HGNC:20488]  0.4584110
## ENSG00000117569 ENSG00000117569      PTBP2                   polypyrimidine tract binding protein 2 [Source:HGNC Symbol;Acc:HGNC:17662]  0.4918848
## ENSG00000166432 ENSG00000166432      ZMAT1                                zinc finger matrin-type 1 [Source:HGNC Symbol;Acc:HGNC:29377]  0.5366304
## ENSG00000185619 ENSG00000185619      PCGF3                             polycomb group ring finger 3 [Source:HGNC Symbol;Acc:HGNC:10066]  0.3483125
## ENSG00000206530 ENSG00000206530     CFAP44                 cilia and flagella associated protein 44 [Source:HGNC Symbol;Acc:HGNC:25631]  0.3579104
## ENSG00000003509 ENSG00000003509    NDUFAF7 NADH:ubiquinone oxidoreductase complex assembly factor 7 [Source:HGNC Symbol;Acc:HGNC:28816]  0.3566390
## ENSG00000262223 ENSG00000262223 AC110285.1                                                                             novel transcript -0.3090207
##                  AveExpr         t      P.Value adj.P.Val         B
## ENSG00000206047 8.097970 -5.524585 6.960388e-06 0.1080183 3.4342260
## ENSG00000121634 5.019012 -4.916501 3.556657e-05 0.1400654 2.0833849
## ENSG00000152292 4.864189 -4.721844 6.133142e-05 0.1400654 1.6290041
## ENSG00000126790 4.422412  4.634457 7.773531e-05 0.1400654 1.4307382
## ENSG00000117569 5.131354  4.626606 7.940756e-05 0.1400654 1.4129216
## ENSG00000166432 4.438956  4.488288 1.154925e-04 0.1400654 1.0990254
## ENSG00000185619 5.321044  4.459633 1.233731e-04 0.1400654 1.0430452
## ENSG00000206530 4.123562  4.462549 1.238193e-04 0.1400654 1.0406300
## ENSG00000003509 5.379055  4.413380 1.403105e-04 0.1400654 0.9352602
## ENSG00000262223 5.152049 -4.307382 1.860634e-04 0.1400654 0.6979276
## Same for pre vs. after 1 year ###############################################
eset.subset <- eset[, pData(eset)$visit %in% c("T0", "T2")]
## Remove unpaired donors
eset.subset <- eset.subset[, pData(eset.subset)$Donor %in% names(which(table(pData(eset.subset)$Donor) == 2))]
dim(eset.subset) 
## Features  Samples 
##    20202       48
pData(eset.subset)$Donor <- factor(pData(eset.subset)$Donor)
pData(eset.subset)$visit <- factor(pData(eset.subset)$visit)

## design matrix: for now ignore subgroup
moma.rashid <- model.matrix(~ 0 + visit + Donor, data=pData(eset.subset))
colnames(moma.rashid) <- gsub("visit", "", colnames(moma.rashid)) 

## Contrast matrix
contrasts.matrix.rashid <- makeContrasts(
  Post_vs_pre = T2 - T0,
  levels=moma.rashid
)
fit <- lmFit(eset.subset, moma.rashid)
keep <- fit$Amean > 4
fit2.rashid.1year <- contrasts.fit(fit[keep, ], contrasts.matrix.rashid)
fit2.rashid.1year <- eBayes(fit2.rashid.1year, trend = TRUE, robust=TRUE) 
table(de <- decideTests(fit2.rashid.1year, p.value = 0.05))
## 
##     0 
## 15551
## but many genes between are DE with an FDR between 5 and 10% 
table(de <- decideTests(fit2.rashid.1year, p.value = 0.1))
## 
##    -1     0     1 
##  1719 11814  2018
topTable(fit2.liu, coef=1, p.value=0.1, n=10, sort.by = "P")
##                    source  SYMBOL                                                     DESCRIPTION                      GENEBIOTYPE ENTREZID
## ENSG00000165997 ensdb_110   ARL5B                          ADP ribosylation factor like GTPase 5B                   protein_coding   221079
## ENSG00000040633 ensdb_110   PHF23                                           PHD finger protein 23                   protein_coding    79142
## ENSG00000261026 ensdb_110                                   novel transcript, overlapping to EGR3                           lncRNA     <NA>
## ENSG00000130600 ensdb_110     H19                   H19 imprinted maternally expressed transcript                           lncRNA   283120
## ENSG00000118503 ensdb_110 TNFAIP3                                     TNF alpha induced protein 3                   protein_coding     7128
## ENSG00000019186 ensdb_110 CYP24A1                  cytochrome P450 family 24 subfamily A member 1                   protein_coding     1591
## ENSG00000279192 ensdb_110   PWAR5                              Prader Willi/Angelman region RNA 5                              TEC     <NA>
## ENSG00000115009 ensdb_110   CCL20                                   C-C motif chemokine ligand 20                   protein_coding     6364
## ENSG00000249138 ensdb_110   SLED1 proteoglycan 3, pro eosinophil major basic protein 2 pseudogene transcribed_processed_pseudogene   643036
## ENSG00000171940 ensdb_110  ZNF217                                         zinc finger protein 217                   protein_coding     7764
##                    UNIPROT LENGTH          GENEID     METABIOTYPE      logFC    AveExpr          t      P.Value    adj.P.Val        B
## ENSG00000165997     B0YIW9  22209 ENSG00000165997  protein_coding  1.6671176  5.9175446  11.541252 4.712802e-08 0.0008788433 9.005843
## ENSG00000040633     Q9BUL5   4694 ENSG00000040633  protein_coding -0.6102185  4.5412871 -10.575801 1.283630e-07 0.0011968568 8.045979
## ENSG00000261026       <NA>   4997 ENSG00000261026 long_non_coding  3.8880188 -0.2657712   9.984821 2.460841e-07 0.0015296585 4.602253
## ENSG00000130600       <NA>   9387 ENSG00000130600 long_non_coding  5.3886969 -1.0689809   9.708224 4.621405e-07 0.0018464388 6.131134
## ENSG00000118503     Q5VXR0  16101 ENSG00000118503  protein_coding  2.4512651  9.2667332   9.645060 4.959199e-07 0.0018464388 6.771843
## ENSG00000019186     Q07973  20541 ENSG00000019186  protein_coding -4.1040213 -1.6357374  -9.226407 5.940923e-07 0.0018464388 3.924560
## ENSG00000279192       <NA>   3180 ENSG00000279192            <NA>  0.7557717  3.1271397   9.042741 7.416132e-07 0.0019756576 6.285701
## ENSG00000115009 A0A2R8Y806  11826 ENSG00000115009  protein_coding  4.0117225  1.1834510   8.716809 1.461069e-06 0.0027041740 4.929147
## ENSG00000249138       <NA>    751 ENSG00000249138      pseudogene  3.6836561 -0.4619774   8.484649 1.486413e-06 0.0027041740 3.654499
## ENSG00000171940     O75362  42837 ENSG00000171940  protein_coding -0.5928820  6.9140751  -8.462452 1.529141e-06 0.0027041740 5.638985
## Comparison of fold changes: 2 months ########################################
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
top.y <- topTable(fit2.rashid.2months, coef="Post_vs_pre", sort.by = "none", n=Inf)
myDf <- merge(top.x, top.y, by="GENEID") 

## Total least squares regression 
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
eq <- as.expression(substitute(italic(y) == b %.% italic(x) + a, 
                               list(a = format(unname(int), digits = 2),
                                    b = format(unname(slp), digits = 2))))

## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin" 
myDf$highlight[grep("^MT-", myDf$SYMBOL.x)] <- "Mitochondrial" 
table(myDf$highlight)
## 
##  Erythrocytes    Hemoglobin Mitochondrial   Neutrophils          NFKB 
##            12            12             8            10            15
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ], 
              myDf[!is.na(myDf$highlight), ])

f7p2 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) + 
  ylab(bquote('log'[2]~'Fold-Change')) +
  xlab(bquote('log'[2]~'Fold-Change')) +
  geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
  geom_density_2d(col="grey30") +
  
  ## Correlation coefficient
  stat_cor(label.x = -2.12, label.y = 2.12, aes(label = after_stat(r.label)), size=7, hjust=0) + 
  ## Regression line slope and intercept 
  geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
  annotate(geom="text", x=-2.12, y=1.8, 
           label=eq, parse = TRUE, # See above
           size=7, hjust=0, vjust=0) +

  ## Fix limits so that range is equal on x and y axes
  xlim(c(-2.12, 2.12)) +
  ylim(c(-2.12, 2.12)) +
  scale_color_manual(name="",
    limits=sort(unique(myDf$highlight)),
    values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
    na.value = "grey80") +
  scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
  scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
  annotate(geom = "text", x=-2.12, y=0, label="2 months post vs. pre-surgery (Rashid et al.)", angle = 90, size=5, fontface = 'italic') +
  annotate(geom = "text", x=0, y=-2.12, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
  myTheme 
f7p2

## Comparison of fold changes: 1 year ########################################
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
top.y <- topTable(fit2.rashid.1year, coef="Post_vs_pre", sort.by = "none", n=Inf)
myDf <- merge(top.x, top.y, by="GENEID") 

## Total least squares regression 
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
eq <- as.expression(substitute(italic(y) == b %.% italic(x) + a, 
                               list(a = format(unname(int), digits = 2),
                                    b = format(unname(slp), digits = 2))))

## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin" 
myDf$highlight[grep("^MT-", myDf$SYMBOL.x)] <- "Mitochondrial" 
table(myDf$highlight)
## 
##  Erythrocytes    Hemoglobin Mitochondrial   Neutrophils          NFKB 
##            12            12             8            10            15
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ], 
              myDf[!is.na(myDf$highlight), ])

f7p4 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) + 
  ylab(bquote('log'[2]~'Fold-Change')) +
  xlab(bquote('log'[2]~'Fold-Change')) +
  geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
  geom_density_2d(col="grey30") +
  
  ## Correlation coefficient
  stat_cor(label.x = -2.12, label.y = 2.12, aes(label = after_stat(r.label)), size=7, hjust=0) + 
  ## Regression line slope and intercept 
  geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
  annotate(geom="text", x=-2.12, y=1.8, 
           label=eq, parse = TRUE, # See above
           size=7, hjust=0, vjust=0) +

  ## Fix limits so that range is equal on x and y axes
  xlim(c(-2.12, 2.12)) +
  ylim(c(-2.12, 2.12)) +
  scale_color_manual(name="",
    limits=sort(unique(myDf$highlight)),
    values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
    na.value = "grey80") +
  scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
  scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
  annotate(geom = "text", x=-2.12, y=0, label="12 months post vs. pre-surgery (Rashid et al.)", angle = 90, size=5, fontface = 'italic') +
  annotate(geom = "text", x=0, y=-2.12, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
  myTheme 
f7p4

Berisha et al.

https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0016729

# load series and platform data from GEO
gset <- getGEO("GSE19790", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE19790_series_matrix.txt.gz
gset <- gset[[1]]

## Update the probe annotation: Illumina human-6 v2.0 expression beadchip
fData(gset)$GENEID <- AnnotationDbi::mapIds(illuminaHumanv2.db, keys = row.names(gset), keytype = "PROBEID", column = "ENSEMBL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
fData(gset)$SYMBOL <- AnnotationDbi::mapIds(illuminaHumanv2.db, keys = row.names(gset), keytype = "PROBEID", column = "SYMBOL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
## Sample info
pData(gset)$group <- factor(gsub("Subject\\s\\d+\\s(pre|post)\\ssurgery" , "\\1", pData(gset)$title))
pData(gset)$Donor <- factor(gsub("Subject\\s(\\d+)\\s(pre|post)\\ssurgery" , "\\1", pData(gset)$title))

plotDensities(exprs(gset), group = pData(gset)$Donor, legend="right")

plotDensities(log2(exprs(gset)), group = pData(gset)$Donor, legend="right")

## Data was quantile normalized already, we simply need to log transform 
exprs(gset) <- log2(exprs(gset))

## Filter out probes without an Ensembl ID
gset <- gset[!is.na(fData(gset)$GENEID)] 
## Average expression of probes matching to same Ensembl ID
gset <- ExpressionSet(assayData = avereps(gset, ID=fData(gset)$GENEID), 
                      phenoData = phenoData(gset))

fData(gset)$GENEID <- row.names(gset)
fData(gset)$SYMBOL <- mapIds(edb, key=row.names(gset), keytype="GENEID", column="SYMBOL")
fData(gset)$UNIPROT <- mapIds(edb, key=row.names(gset), keytype="GENEID", column="UNIPROTID") 
fData(gset)$GENENAME <- mapIds(edb, key=row.names(gset), keytype="GENEID", column="DESCRIPTION")

## DE analysis #################################################################
## design matrix 
moma.berisha <- model.matrix(~ 0 + group + Donor, data=pData(gset))
colnames(moma.berisha) <- gsub("group", "", colnames(moma.berisha)) 

## Contrast matrix
contrasts.matrix.berisha <- makeContrasts(
  Post_vs_pre = post - pre,
  levels=moma.berisha
)
fit <- lmFit(gset, moma.berisha)
keep <- fit$Amean > 6.5 # arbitrary filter on expression level
fit2.berisha <- contrasts.fit(fit[keep, ], contrasts.matrix.berisha)
fit2.berisha <- eBayes(fit2.berisha, trend = TRUE, robust=TRUE)
table(de <- decideTests(fit2.berisha, p.value = 0.2)) ## relaxed FDR cutoff here!
## 
##    -1     0     1 
##     7 12930     1
topTable(fit2.berisha)
##                          GENEID  SYMBOL       UNIPROT                                                                              GENENAME
## ENSG00000148346 ENSG00000148346    LCN2    P80188.206                                        lipocalin 2 [Source:HGNC Symbol;Acc:HGNC:6526]
## ENSG00000124469 ENSG00000124469 CEACAM8    Q0Z7S6.114                       CEA cell adhesion molecule 8 [Source:HGNC Symbol;Acc:HGNC:1820]
## ENSG00000164047 ENSG00000164047    CAMP  A0A384NPR0.7                 cathelicidin antimicrobial peptide [Source:HGNC Symbol;Acc:HGNC:1472]
## ENSG00000173391 ENSG00000173391    OLR1 A0A024RAU0.50        oxidized low density lipoprotein receptor 1 [Source:HGNC Symbol;Acc:HGNC:8133]
## ENSG00000132950 ENSG00000132950   ZMYM5          <NA>                 zinc finger MYM-type containing 5 [Source:HGNC Symbol;Acc:HGNC:13029]
## ENSG00000206047 ENSG00000206047   DEFA1    P59665.167                                   defensin alpha 1 [Source:HGNC Symbol;Acc:HGNC:2761]
## ENSG00000164821 ENSG00000164821   DEFA4    P12838.160                                   defensin alpha 4 [Source:HGNC Symbol;Acc:HGNC:2763]
## ENSG00000169397 ENSG00000169397  RNASE3    P12724.191                    ribonuclease A family member 3 [Source:HGNC Symbol;Acc:HGNC:10046]
## ENSG00000223609 ENSG00000223609     HBD     E9PFT6.63                           hemoglobin subunit delta [Source:HGNC Symbol;Acc:HGNC:4829]
## ENSG00000007944 ENSG00000007944   MYLIP    Q8WY64.169 myosin regulatory light chain interacting protein [Source:HGNC Symbol;Acc:HGNC:21155]
##                      logFC   AveExpr         t      P.Value  adj.P.Val         B
## ENSG00000148346 -0.9693081  8.914254 -7.411117 1.423487e-06 0.01841707 3.7564295
## ENSG00000124469 -0.9649413  9.087539 -6.975751 3.029591e-06 0.01959842 3.2898716
## ENSG00000164047 -0.8342927 10.282562 -6.550220 6.498199e-06 0.02802456 2.8018375
## ENSG00000173391 -0.7458480  7.697638 -6.054068 1.632259e-05 0.05279540 2.1914166
## ENSG00000132950  0.2515660  6.819214  5.748582 2.926057e-05 0.07571465 1.7930338
## ENSG00000206047 -1.0426848 11.519667 -5.604153 3.872615e-05 0.08350649 1.5987012
## ENSG00000164821 -0.6096806  7.585210 -5.522193 4.545724e-05 0.08401797 1.4867242
## ENSG00000169397 -0.3631733  7.325367 -5.106690 1.037699e-04 0.16782186 0.9004738
## ENSG00000223609 -0.9005301  9.539256 -4.883716 1.629694e-04 0.23427750 0.5734816
## ENSG00000007944  0.3466481 11.990352  4.733226 2.216880e-04 0.28681996 0.3481504
## Comparison of fold changes ##################################################
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
top.y <- topTable(fit2.berisha, coef="Post_vs_pre", sort.by = "none", n=Inf)
myDf <- merge(top.x, top.y, by="GENEID") 

## Total least squares regression 
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
eq <- as.expression(substitute(italic(y) == b %.% italic(x) + a, 
                               list(a = format(unname(int), digits = 2),
                                    b = format(unname(slp), digits = 2))))

## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes" 
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin" 
myDf$highlight[grep("^MT-", myDf$SYMBOL.x)] <- "Mitochondrial" 
table(myDf$highlight)
## 
## Erythrocytes   Hemoglobin  Neutrophils         NFKB 
##           13            9           11           15
## Sort data frame so that colored points are plotted last
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ], 
              myDf[!is.na(myDf$highlight), ])

f7p3 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) + 
  ylab(bquote('log'[2]~'Fold-Change')) +
  xlab(bquote('log'[2]~'Fold-Change')) +
  geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
  geom_density_2d(col="grey30") +
  
  ## Correlation coefficient
  stat_cor(label.x = -2.12, label.y = 2.12, aes(label = after_stat(r.label)), size=7, hjust=0) + 
  ## Regression line slope and intercept 
  geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
  annotate(geom="text", x=-2.12, y=1.8, 
           label=eq, parse = TRUE, # See above
           size=7, hjust=0, vjust=0) +

  ## Fix limits so that range is equal on x and y axes
  xlim(c(-2.12, 2.12)) +
  ylim(c(-2.12, 2.12)) +
  scale_color_manual(name="",
    limits=sort(unique(myDf$highlight)),
    values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
    na.value = "grey80") +
  scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
  scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
  annotate(geom = "text", x=-2.12, y=0, label="6-12 months post vs. pre-surgery (Berisha et al.)", angle = 90, size=5, fontface = 'italic') +
  annotate(geom = "text", x=0, y=-2.12, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
  myTheme 
f7p3